/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection

Created on: 12/28/2022
Last Modified on: 2/17/2024

Description: This program generates the investigator-level placement and outcome
rates, as well as standard errors for these estimates. Relative to our baseline 
specification, this program generates subsequent maltreatment rates *while in foster 
care* to assess whether investigators may make decisions based on maltreatment risk 
while in care. 

Note that we have removed the file directory names from this program for 
confidentiality reasons.
********************************************************************************/

**************************
**(0) SETUP
**************************
* Start from a clean session: no data in memory, no output paging, no global
* macros left over from an earlier run, and no open log file.
clear
set more off
* The keyword that drops every macro is _all (with the underscore); a plain
* "all" would only target a macro literally named all.
macro drop _all
capture log close

*Set directories 
* Paths are intentionally left blank (see header). File names are appended
* directly to these globals later in the program, so each path should end with
* a directory separator when filled in.
*   cleand   - folder holding the cleaned analysis sample that is read in
*   tmpd     - folder where intermediate and output .dta files are written
*   data_raw - folder holding the raw allegations.csv file
global cleand 
global tmpd 
global data_raw

**************************
**(1) COMPUTE SUBSEQUENT MALTREATMENT RISK, INCLUDING SUBSEQUENT MALTREATMENT IN FC 
**************************
* Load the raw allegation-level file. The path is quoted, as for the other file
* references in this program, so a directory name containing spaces still works.
import delimited "${data_raw}allegations.csv", encoding(UTF-8) clear

*Restrict to the records we care about:
**Flag allegations that resulted in a formal investigation
gen cw_flag_screenedin=screeningdecision=="Accept and Assign for field investigation"
la var cw_flag_screenedin "Allegation was formally investigated"

**Flag children who were the alleged victim in the investigation
gen cw_flag_child_victim=child_role=="Alleged Victim (AV)"
la var cw_flag_child_victim "Child was alleged victim"
keep if cw_flag_child_victim==1 & cw_flag_screenedin==1

**Gen consistent investigation id throughout the sample period 
* The raw report date is kept as the string cw_date; complaint_date is rebuilt
* from it as a Stata daily date.
rename complaint_date cw_date
la var cw_date "CW- Allegation Report Date"
gen complaint_date = date(cw_date,"MDY")
* 20933 is 24apr2017 in Stata's daily-date encoding (days since 01jan1960).
* Reports dated before that day take their id from investigation_caseid.
* NOTE(review): presumably the id scheme changed on that date -- confirm.
replace intake_id = investigation_caseid if complaint_date<20933
rename (intakechildpartyid intake_id) (vicid inv_caseid) 

**Keep and rename relevant variables
keep vicid county_name complaint_date inv_caseid allegationtypedesc cw_flag* relationtypeperptovictim finding catdesc intakeperppartyid cw_date  

rename county_name cw_county
la var cw_county "CW- County of Investigation"

* Spell out the full variable name so the rename does not depend on
* variable-name abbreviation being enabled.
rename allegationtypedesc cw_allegation
la var cw_allegation "CW- Allegation Type"

rename relationtypeperptovictim relationship 
la var relationship "CW - Relationship of alleged perp to child"

rename finding cw_sub
la var cw_sub "CW- Substantatiated"

*Gen foster care, substantiation, and foster care perp indicators 
* fc = 1 when catdesc is "1" and the finding is neither "No Preponderance",
* "No Evidence", nor blank.
* NOTE(review): the meaning of catdesc=="1" is not visible in this program;
* presumably it marks a foster care placement -- confirm against the codebook.
gen fc = catdesc=="1"
replace fc=0 if cw_sub=="No Preponderance" | cw_sub=="No Evidence" | cw_sub==""

* preponderance = 1 when the finding is "Preponderance".
gen preponderance = cw_sub=="Preponderance"

* foster = 1 when the alleged perpetrator is a foster relative of the child, or
* when the perpetrator's relationship to the child is missing.
* Background: In the federal data, subsequent maltreatment while in foster care
* is incredibly rare. The question asks the following: Of all children who were
* in foster care during the year, what percentage were the subject of
* substantiated or indicated maltreatment by a foster parent or facility staff?
* The issue is that in our admin data, it can appear as if a subsequent
* investigation happens in foster care because new reports get continuously
* added in the data to a focal investigation. To calculate maltreatment while in
* foster care, we then exploit the fact that we see who the alleged perp is.
* To be conservative, we include cases where the relationship of the alleged
* perpetrator is missing, since, according to the state, some of these could
* include foster care facilities. We note in the paper that the number we get
* from this exercise is very similar to federally reported data for the State
* of Michigan.
gen foster = inlist(relationship, "Foster Brother", "Foster Daughter", "Foster Father", "Foster Mother", "Foster Sister", "Foster Son", "")

* Collapse each indicator to the child x investigation level: it equals 1 if
* any allegation row within the same vicid and inv_caseid has it equal to 1.
foreach x in foster fc preponderance {
	gegen tmp_`x' = max(`x'), by(vicid inv_caseid)
	drop `x'
	rename tmp_`x' `x'
}

* Keep one row per child x investigation. The three indicators above are
* constant within these groups after the collapse.
* NOTE(review): other fields, including complaint_date, come from whichever
* row survives -- confirm they do not vary within an investigation.
gduplicates drop vicid inv_caseid, force 

*Gen variable: new investigation within six months (including any maltreatment while in foster care)
*For a given focal child X inv observation, did you have another investigation within 6 months?
cap drop inv6m 
gen inv6m = 0

sort vicid complaint_date inv_caseid, stable 
order inv6m   

* Step 1: flag the focal row when the child's next investigation (in date
* order) was reported 1 to 180 days after the focal report. The by-prefix with
* the parenthesised sort keys makes the required within-child ordering explicit.
* NOTE(review): only the immediately following investigation is examined, and a
* follow-up reported on the same day is outside the window -- confirm intended.
by vicid (complaint_date inv_caseid): replace inv6m = 1 if inrange(complaint_date[_n+1], complaint_date+1, complaint_date+180)

* Step 2: when the focal investigation has fc==1, the follow-up only counts if
* its foster flag is 1; otherwise the outcome is reset to 0. Running this inside
* the same by-group keeps the lead lookup from reaching into another child's rows.
by vicid (complaint_date inv_caseid): replace inv6m = 0 if foster[_n+1]==0 & fc==1

* Step 3: the outcome is undefined when the 180-day follow-up window extends
* past 20 November 2019.
replace inv6m = . if complaint_date+180>date("11/20/2019","MDY")

label var inv6m "Subject of another investigation within 6 months"

* Save the child x investigation level outcome for merging into the analysis sample.
rename inv6m inv6m_incl_fc 
keep vicid inv_caseid inv6m_incl_fc 
save "${tmpd}inv6m_incl_fc.dta", replace 



**************************
**(2) ESTIMATE INVESTIGATORS' PLACEMENT AND MALTREATMENT RATES CONDITIONAL ON THE CHILD BEING PLACED 
**************************
* Load the investigator analysis sample (one row per vicid x inv_caseid, as
* required by the 1:1 merge below).
use "${cleand}analysis_sample_investigators_qje.dta", clear 

*Merge in the subsequent maltreatment outcome that accounts for maltreatment while in foster care 
* keep(1 3) retains every analysis-sample row, whether matched or not.
cap drop _merge 
merge 1:1 vicid inv_caseid using "${tmpd}inv6m_incl_fc.dta", keepus(inv6m_incl_fc) keep(1 3)
* NOTE(review): this sets the outcome to 0 both for unmatched rows and for rows
* where the outcome was set to missing upstream because the 180-day follow-up
* window was incomplete -- confirm that is intended.
replace inv6m_incl_fc=0 if inv6m_incl_fc==.
* Replace the baseline outcome with the version that accounts for foster care.
drop inv6m 
rename inv6m_incl_fc inv6m 

*Keep only variables of interest 
keep worker_id rotationgroup pre_black fc inv6m bshare zipcode_vic cps_year
rename pre_black black 
* white is 1 only when black is exactly 0; it is 0 when black is missing.
gen white = black==0 

**Estimate investigator placement and subsequent maltreatment rates
gen nofc=fc==0
* Consecutive integer id per investigator, for use as a factor variable.
egen grpworker_id = group(worker_id)
* Zip code x year cells, absorbed as fixed effects in the regressions.
egen cell=group(zipcode_vic cps_year)
* Store the list of investigator ids directly in the local. Assigning it with
* an "=" expression evaluates it as a string expression, which can truncate a
* long list on older Stata versions.
levelsof grpworker_id, local(levels)

* Loop over all the outcomes 
// For each outcome this loop fills investigator-level variables:
//   base_<outcome>, se_<outcome>     pooled estimate and its standard error
//   w_<outcome>,    w_se_<outcome>   estimate and standard error, black==0
//   b_<outcome>,    b_se_<outcome>   estimate and standard error, black==1
// Relies on the local "levels" (list of grpworker_id values) and on the
// variables grpworker_id, cell, nofc, black and white created beforehand.
foreach var in inv6m {

	* For selectively observed vars, condition on *being placed in foster care* 
	// Regress the outcome on investigator indicators, absorbing the cell fixed
	// effects, using only rows with nofc==0.
	// NOTE(review): the guard only wraps the regression; if "nofc" were ever
	// added to the outcome list, the predict and lincom steps below would run
	// on whatever estimates are already in memory -- confirm before extending.
	if "`var'"!="nofc"{
		qui: reghdfe `var' i.grpworker_id if nofc == 0, resid absorb(cell)
	}
	
	* retrieve average of fixed effect
	// d is the difference between the prediction that includes the absorbed
	// component (xbd) and the one that excludes it (xb); its mean is added back
	// to each investigator's linear combination below.
	predict xbd, xbd
	predict xb, xb
	gen d = xbd - xb 
	sum d 
	local exp = r(mean)
	
	gen base_`var' = .
	gen se_`var' = .
	
	qui{
	// One lincom per investigator: constant + investigator coefficient + mean of d.
	// capture means an investigator whose lincom fails keeps missing values.
	foreach i in `levels' {
		capture: lincom _cons + _b[`i'.grpworker_id] + `exp'
		
		if _rc == 0 {
            replace base_`var' =  r(estimate) if grpworker_id == `i'
			replace se_`var'= r(se) if  grpworker_id==`i'
		}
	}
	}
	drop xbd xb d
	* By race (black and white)
	* For other outcomes, condition on being in fc
	// x copies the outcome only for rows with black==1 or white==1. The model
	// adds investigator-by-black interactions to the investigator indicators.
	if "`var'"!="nofc"{
		gen x = `var' if black == 1 | white == 1
		reghdfe x i.grpworker_id i.grpworker_id#i.black if nofc==0, resid absorb(cell)
	}
	
	
	* retrieve average of fixed effect
	// Same construction as above, now based on the by-race regression.
	predict xbd, xbd
	predict xb, xb
	gen d = xbd - xb 
	sum d 
	local exp = r(mean)
	
	gen b_`var' = .
	gen b_se_`var' = .
	gen w_`var' = .
	gen w_se_`var' = .
   
   	* Quietly
	qui {
	* White children
	// constant + investigator coefficient + mean of d
	foreach i in `levels'{
		capture: lincom _cons + _b[`i'.grpworker_id] + `exp'
		if _rc == 0 {
            replace w_`var' =  r(estimate) if grpworker_id == `i'
			replace w_se_`var'= r(se) if  grpworker_id==`i'
		}
	}
	* Black children
	// as for white children, plus the investigator-by-black interaction coefficient
	foreach i in `levels'{
		capture: lincom _cons + _b[`i'.grpworker_id] + _b[`i'.grpworker_id#1.black] + `exp'
		if _rc == 0 {
            replace b_`var' =  r(estimate) if grpworker_id == `i'
			replace b_se_`var'= r(se) if  grpworker_id==`i'
		}
	}
	}
	drop x xbd xb d
}

*----------------------------------------------------------------------*
* Save investigator-level dataset 
*----------------------------------------------------------------------*
* Caseload counts per investigator, computed on the full in-memory sample
* before it is reduced to one row per investigator: all rows, rows with
* black==1, and rows with white==1.
bysort worker_id: generate count_inv = _N
egen count_black = total(black), by(worker_id)
egen count_white = total(white), by(worker_id)

* Retain the identifiers, the counts, and the estimate variables.
* NOTE(review): se_inv6m (the standard error paired with base_inv6m) matches
* none of the patterns below and is therefore not saved -- confirm intended.
keep worker_id grpworker_id count_inv count_black count_white base_* b_* w_*

* One row per investigator; every retained variable is constant within
* worker_id, so the first row of each group is representative.
duplicates drop worker_id, force
keep if worker_id != .

save "${tmpd}infc_maltreatment_rates_qje.dta", replace